{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "catholic-payroll",
   "metadata": {},
   "source": [
    "# ISAT demo using Landsat-8 and Sentinel-2 metadata\n",
    "# I-SAT has independent spatial join and temporal filtering\n",
    "## This demo could be used to find Spatio-temporal intersection between the two datasets and the potential area of interest at hour level"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "incorporated-characterization",
   "metadata": {},
   "source": []
  },
  {
   "cell_type": "markdown",
   "id": "3c804a58-035e-4a79-a94e-c74843fdd207",
   "metadata": {},
   "source": [
    "## Input date range for query, we have dataset collected in 2020 in this demo"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "contrary-margin",
   "metadata": {},
   "outputs": [],
   "source": [
    "# date in the format of \"yyy-MM-dd\", results include both start date and end date\n",
    "\n",
    "# The hour delay variable represents how many hours apart the two datasets are considered to be temporally intersected\n",
    "# E.g. if hour_delay = 6, for 2020-06-26 12:00:00, timestamps between 2020-06-26 06:00:00 and 2020-06-26 18:00:00 are considered to be temporally intersected\n",
    "start_date = '2018-01-01'\n",
    "end_date = '2018-06-30'\n",
    "duration_month = 6 # for export\n",
    "time_index_hour_length = 24\n",
    "hour_delay = 6"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "olive-balance",
   "metadata": {},
   "source": [
    "## please input the potential area of interest, we have \"Beaufort_Sea\" and \"Wandel_Sea\" in in this demo, could leave blank if not using AOI"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "close-editor",
   "metadata": {},
   "outputs": [],
   "source": [
    "Paoi = 'Wandel_Sea'"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "convenient-capability",
   "metadata": {},
   "source": [
    "## First, setup the Spark and Sedona environment"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "stuffed-anatomy",
   "metadata": {},
   "outputs": [],
   "source": [
    "import findspark\n",
    "#findspark.init() "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "binding-scholar",
   "metadata": {},
   "outputs": [],
   "source": [
    "SPARK_HOME='/opt/cloudera/parcels/CDH/lib/spark'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "magnetic-barbados",
   "metadata": {},
   "outputs": [],
   "source": [
    "findspark.init(SPARK_HOME)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "neutral-taylor",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/zhangp/.conda/envs/py37_environment/lib/python3.7/site-packages/geopandas/_compat.py:58: UserWarning: The installed version of PyGEOS is too old (0.6 installed, 0.8 required), and thus GeoPandas will not use PyGEOS.\n",
      "  UserWarning,\n"
     ]
    }
   ],
   "source": [
    "import json\n",
    "import os\n",
    "import codecs\n",
    "import subprocess\n",
    "#from hdfs import InsecureClient\n",
    "import numpy as np\n",
    "#from pyspark import SparkContext\n",
    "from pyspark import SQLContext\n",
    "from pyspark.sql import Row\n",
    "from pyspark.sql import functions as F\n",
    "from pyspark.sql.types import *\n",
    "import rtree\n",
    "from pyspark.sql import Window\n",
    "#import igraph\n",
    "#from igraph import Graph\n",
    "import geofeather"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "likely-logging",
   "metadata": {},
   "outputs": [],
   "source": [
    "from pyspark.sql import SparkSession\n",
    "from pyspark import StorageLevel\n",
    "import geopandas as gpd\n",
    "import pandas as pd\n",
    "from pyspark.sql.types import StructType\n",
    "from pyspark.sql.types import StructField\n",
    "from pyspark.sql.types import StringType\n",
    "from pyspark.sql.types import LongType\n",
    "from shapely.geometry import Point\n",
    "from shapely.geometry import Polygon\n",
    "\n",
    "from sedona.register import SedonaRegistrator\n",
    "from sedona.core.SpatialRDD import SpatialRDD\n",
    "from sedona.core.SpatialRDD import PointRDD\n",
    "from sedona.core.SpatialRDD import PolygonRDD\n",
    "from sedona.core.SpatialRDD import LineStringRDD\n",
    "from sedona.core.enums import FileDataSplitter\n",
    "from sedona.utils.adapter import Adapter\n",
    "from sedona.core.spatialOperator import KNNQuery\n",
    "from sedona.core.spatialOperator import JoinQuery\n",
    "from sedona.core.spatialOperator import JoinQueryRaw\n",
    "from sedona.core.spatialOperator import RangeQuery\n",
    "from sedona.core.spatialOperator import RangeQueryRaw\n",
    "from sedona.core.formatMapper.shapefileParser import ShapefileReader\n",
    "from sedona.core.formatMapper import WkbReader\n",
    "from sedona.core.formatMapper import WktReader\n",
    "from sedona.core.formatMapper import GeoJsonReader\n",
    "from sedona.sql.types import GeometryType\n",
    "from sedona.core.enums import GridType\n",
    "from sedona.core.SpatialRDD import RectangleRDD\n",
    "from sedona.core.enums import IndexType\n",
    "from sedona.core.geom.envelope import Envelope\n",
    "from sedona.utils import SedonaKryoRegistrator, KryoSerializer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "pressed-plant",
   "metadata": {},
   "outputs": [],
   "source": [
    "os.environ['PYSPARK_PYTHON'] = \"./environment/bin/python\"\n",
    "os.environ['YARN_CONF_DIR'] = \"/opt/cloudera/parcels/CDH/lib/spark/conf/yarn-conf\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "adult-study",
   "metadata": {},
   "outputs": [],
   "source": [
    "spark = SparkSession \\\n",
    ".builder \\\n",
    ".appName(\"test_test_1\") \\\n",
    ".master('yarn') \\\n",
    ".config(\"spark.serializer\", KryoSerializer.getName) \\\n",
    ".config(\"spark.kryo.registrator\", SedonaKryoRegistrator.getName) \\\n",
    ".config('spark.jars','sedona-core-2.4_2.11-1.0.0-incubating.jar,sedona-sql-2.4_2.11-1.0.0-incubating.jar,sedona-python-adapter-2.4_2.11-1.0.0-incubating.jar,sedona-viz-2.4_2.11-1.0.0-incubating.jar,geotools-wrapper-geotools-24.0.jar') \\\n",
    ".config('spark.executor.memory', '20g') \\\n",
    ".config('spark.driver.memory', '10g') \\\n",
    ".config('spark.sql.shuffle.partitions', 6144) \\\n",
    ".config('spark.executor.instances', '24') \\\n",
    ".config('spark.executor.cores', '5') \\\n",
    ".config('spark.rpc.message.maxSize', '1024') \\\n",
    ".config('spark.yarn.dist.archives', 'environment.tar.gz#environment') \\\n",
    ".getOrCreate()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "unexpected-speaking",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "SedonaRegistrator.registerAll(spark)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "offshore-purse",
   "metadata": {},
   "source": [
    "## Second, read datasets"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d14e2a78-1249-4a3d-8d65-4a1389e8635e",
   "metadata": {},
   "source": [
    "### Spark have different types of input methods, here we read from Hadoop file system, as our Spark 2.4 is built based on Hadoop Yarn"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "minus-american",
   "metadata": {},
   "outputs": [],
   "source": [
    "l8_df_raw = spark.read.option(\"header\",True).option('inferSchema', True).options(delimiter='|').csv(\"meta_l8_2018_cloud_sep.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "passing-recipient",
   "metadata": {},
   "outputs": [],
   "source": [
    "s2_df_raw = spark.read.option(\"header\",True).option('inferSchema', True).options(delimiter='|').csv(\"meta_s2_2018_cloud_sep.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "split-relay",
   "metadata": {},
   "outputs": [],
   "source": [
    "# pr_df_raw = spark.read.option(\"header\",True).option('inferSchema', True).csv(\"meta_pr_2016.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "relative-baking",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "155924"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "l8_df_raw.count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "unknown-universal",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "461000"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "s2_df_raw.count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "marked-convertible",
   "metadata": {},
   "outputs": [],
   "source": [
    "# pr_df_raw.count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "republican-immigration",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+---+--------------------+----------+--------------------+--------------------+\n",
      "|_c0|        system:index|cloudcover|           timestamp|                .geo|\n",
      "+---+--------------------+----------+--------------------+--------------------+\n",
      "|  0|LC08_001004_20180706|     58.52|2018-07-06 14:07:...|\"{\"\"type\"\":\"\"Line...|\n",
      "|  1|LC08_001004_20180722|     85.11|2018-07-22 14:07:...|\"{\"\"type\"\":\"\"Line...|\n",
      "|  2|LC08_001004_20180807|      0.08|2018-08-07 14:07:...|\"{\"\"type\"\":\"\"Line...|\n",
      "|  3|LC08_001004_20180908|      1.85|2018-09-08 14:07:...|\"{\"\"type\"\":\"\"Line...|\n",
      "|  4|LC08_001005_20180417|      4.23|2018-04-17 14:08:...|\"{\"\"type\"\":\"\"Line...|\n",
      "|  5|LC08_001005_20180706|     41.22|2018-07-06 14:07:...|\"{\"\"type\"\":\"\"Line...|\n",
      "|  6|LC08_001005_20180722|     42.02|2018-07-22 14:07:...|\"{\"\"type\"\":\"\"Line...|\n",
      "|  7|LC08_001005_20180807|       0.1|2018-08-07 14:08:...|\"{\"\"type\"\":\"\"Line...|\n",
      "|  8|LC08_001005_20180823|     66.75|2018-08-23 14:08:...|\"{\"\"type\"\":\"\"Line...|\n",
      "|  9|LC08_001005_20180908|     22.47|2018-09-08 14:08:...|\"{\"\"type\"\":\"\"Line...|\n",
      "| 10|LC08_001006_20180401|      3.01|2018-04-01 14:08:...|\"{\"\"type\"\":\"\"Line...|\n",
      "| 11|LC08_001006_20180417|      4.27|2018-04-17 14:08:...|\"{\"\"type\"\":\"\"Line...|\n",
      "| 12|LC08_001006_20180519|     36.96|2018-05-19 14:08:...|\"{\"\"type\"\":\"\"Line...|\n",
      "| 13|LC08_001006_20180604|      0.07|2018-06-04 14:07:...|\"{\"\"type\"\":\"\"Line...|\n",
      "| 14|LC08_001006_20180620|      1.37|2018-06-20 14:08:...|\"{\"\"type\"\":\"\"Line...|\n",
      "| 15|LC08_001006_20180706|      13.4|2018-07-06 14:08:...|\"{\"\"type\"\":\"\"Line...|\n",
      "| 16|LC08_001006_20180722|     43.27|2018-07-22 14:08:...|\"{\"\"type\"\":\"\"Line...|\n",
      "| 17|LC08_001006_20180807|      0.23|2018-08-07 14:08:...|\"{\"\"type\"\":\"\"Line...|\n",
      "| 18|LC08_001006_20180823|     52.62|2018-08-23 14:08:...|\"{\"\"type\"\":\"\"Line...|\n",
      "| 19|LC08_001006_20180908|     40.37|2018-09-08 14:08:...|\"{\"\"type\"\":\"\"Line...|\n",
      "+---+--------------------+----------+--------------------+--------------------+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "l8_df_raw.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "european-arrangement",
   "metadata": {},
   "outputs": [],
   "source": [
    "# l8_df_raw = l8_df_raw.filter(F.col('cloudcover') <= 1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "declared-magnitude",
   "metadata": {},
   "outputs": [],
   "source": [
    "# l8_df_raw.filter(F.col('cloudcover') == 0).count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "adult-destination",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "155924"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "l8_df_raw.count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "executive-angola",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+---+--------------------+----------+--------------------+--------------------+\n",
      "|_c0|        system:index|cloudcover|           timestamp|                .geo|\n",
      "+---+--------------------+----------+--------------------+--------------------+\n",
      "|  0|20171226T094359_2...|     9.219|2018-02-02 11:37:...|\"{\"\"type\"\":\"\"Line...|\n",
      "|  1|20180101T082329_2...|    98.622|2018-01-01 08:23:...|\"{\"\"type\"\":\"\"Line...|\n",
      "|  2|20180101T082329_2...|     100.0|2018-01-01 08:23:...|\"{\"\"type\"\":\"\"Line...|\n",
      "|  3|20180101T082329_2...|   47.2689|2018-01-01 08:23:...|\"{\"\"type\"\":\"\"Line...|\n",
      "|  4|20180101T082329_2...|   51.0769|2018-01-01 08:23:...|\"{\"\"type\"\":\"\"Line...|\n",
      "|  5|20180101T082329_2...|   76.9772|2018-01-01 08:23:...|\"{\"\"type\"\":\"\"Line...|\n",
      "|  6|20180101T082329_2...|   77.5421|2018-01-01 08:23:...|\"{\"\"type\"\":\"\"Line...|\n",
      "|  7|20180101T082329_2...|   85.8672|2018-01-01 08:23:...|\"{\"\"type\"\":\"\"Line...|\n",
      "|  8|20180101T082329_2...|   37.5433|2018-01-01 08:23:...|\"{\"\"type\"\":\"\"Line...|\n",
      "|  9|20180101T082329_2...|   67.3636|2018-01-01 08:23:...|\"{\"\"type\"\":\"\"Line...|\n",
      "| 10|20180101T082329_2...|   85.0339|2018-01-01 08:23:...|\"{\"\"type\"\":\"\"Line...|\n",
      "| 11|20180101T082329_2...|   17.2088|2018-01-01 08:23:...|\"{\"\"type\"\":\"\"Line...|\n",
      "| 12|20180101T082329_2...|   16.1119|2018-01-01 08:23:...|\"{\"\"type\"\":\"\"Line...|\n",
      "| 13|20180101T082329_2...|    3.9833|2018-01-01 08:23:...|\"{\"\"type\"\":\"\"Line...|\n",
      "| 14|20180101T082329_2...|   73.4103|2018-01-01 08:23:...|\"{\"\"type\"\":\"\"Line...|\n",
      "| 15|20180101T082329_2...|   68.6584|2018-01-01 08:23:...|\"{\"\"type\"\":\"\"Line...|\n",
      "| 16|20180101T082329_2...|   51.4231|2018-01-01 08:23:...|\"{\"\"type\"\":\"\"Line...|\n",
      "| 17|20180101T082329_2...|   28.1423|2018-01-01 08:23:...|\"{\"\"type\"\":\"\"Line...|\n",
      "| 18|20180101T082329_2...|   23.1729|2018-01-01 08:23:...|\"{\"\"type\"\":\"\"Line...|\n",
      "| 19|20180101T082329_2...|    6.5445|2018-01-01 08:23:...|\"{\"\"type\"\":\"\"Line...|\n",
      "+---+--------------------+----------+--------------------+--------------------+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "s2_df_raw.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "raising-samoa",
   "metadata": {},
   "outputs": [],
   "source": [
    "# s2_df_raw = s2_df_raw.filter(F.col('cloudcover') <= 1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "legitimate-director",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "461000"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "s2_df_raw.count()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "completed-intent",
   "metadata": {},
   "source": [
    "### generate geo columns for all columns, using WKT"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "yellow-giant",
   "metadata": {},
   "outputs": [],
   "source": [
    "s2_df_raw = s2_df_raw.withColumnRenamed('.geo', 'footprint')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "id": "lesser-member",
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "def generate_WKT(GEE_coo_str):\n",
    "    if 'LinearRing' not in GEE_coo_str:\n",
    "        print('error geo type')\n",
    "        return 'unknown geo'\n",
    "    if re.search('(\\[\\[.+\\]\\])', GEE_coo_str):\n",
    "        gee_coo0 = re.search('(\\[\\[.+\\]\\])', GEE_coo_str).group(1)\n",
    "        test_com = re.compile('(\\[.+?)(\\,)(.+?\\])')\n",
    "        gee_coos_1 = test_com.sub(r'\\1 \\3', gee_coo0)\n",
    "        gee_coos_2 = gee_coos_1.replace('[', '').replace(']', '').replace(',', ', ')\n",
    "        return 'POLYGON ((' + gee_coos_2 + '))'\n",
    "    else:\n",
    "        return 'unknown format'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "id": "nearby-firewall",
   "metadata": {},
   "outputs": [],
   "source": [
    "generate_WKT_udf = F.udf(generate_WKT)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "id": "uniform-reflection",
   "metadata": {},
   "outputs": [],
   "source": [
    "l8_df_raw = l8_df_raw.withColumnRenamed('.geo', 'footprint')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "id": "flush-anthropology",
   "metadata": {},
   "outputs": [],
   "source": [
    "l8_df_wkt = l8_df_raw.withColumn('WKT_geo', generate_WKT_udf(F.col('footprint')))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "id": "productive-queue",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+---+--------------------+----------+--------------------+--------------------+--------------------+\n",
      "|_c0|        system:index|cloudcover|           timestamp|           footprint|             WKT_geo|\n",
      "+---+--------------------+----------+--------------------+--------------------+--------------------+\n",
      "|  0|LC08_001004_20180706|     58.52|2018-07-06 14:07:...|\"{\"\"type\"\":\"\"Line...|POLYGON ((-10.924...|\n",
      "|  1|LC08_001004_20180722|     85.11|2018-07-22 14:07:...|\"{\"\"type\"\":\"\"Line...|POLYGON ((-16.038...|\n",
      "|  2|LC08_001004_20180807|      0.08|2018-08-07 14:07:...|\"{\"\"type\"\":\"\"Line...|POLYGON ((-12.596...|\n",
      "|  3|LC08_001004_20180908|      1.85|2018-09-08 14:07:...|\"{\"\"type\"\":\"\"Line...|POLYGON ((-13.429...|\n",
      "|  4|LC08_001005_20180417|      4.23|2018-04-17 14:08:...|\"{\"\"type\"\":\"\"Line...|POLYGON ((-19.204...|\n",
      "|  5|LC08_001005_20180706|     41.22|2018-07-06 14:07:...|\"{\"\"type\"\":\"\"Line...|POLYGON ((-15.823...|\n",
      "|  6|LC08_001005_20180722|     42.02|2018-07-22 14:07:...|\"{\"\"type\"\":\"\"Line...|POLYGON ((-20.743...|\n",
      "|  7|LC08_001005_20180807|       0.1|2018-08-07 14:08:...|\"{\"\"type\"\":\"\"Line...|POLYGON ((-25.727...|\n",
      "|  8|LC08_001005_20180823|     66.75|2018-08-23 14:08:...|\"{\"\"type\"\":\"\"Line...|POLYGON ((-19.249...|\n",
      "|  9|LC08_001005_20180908|     22.47|2018-09-08 14:08:...|\"{\"\"type\"\":\"\"Line...|POLYGON ((-19.547...|\n",
      "| 10|LC08_001006_20180401|      3.01|2018-04-01 14:08:...|\"{\"\"type\"\":\"\"Line...|POLYGON ((-19.492...|\n",
      "| 11|LC08_001006_20180417|      4.27|2018-04-17 14:08:...|\"{\"\"type\"\":\"\"Line...|POLYGON ((-21.456...|\n",
      "| 12|LC08_001006_20180519|     36.96|2018-05-19 14:08:...|\"{\"\"type\"\":\"\"Line...|POLYGON ((-29.341...|\n",
      "| 13|LC08_001006_20180604|      0.07|2018-06-04 14:07:...|\"{\"\"type\"\":\"\"Line...|POLYGON ((-29.095...|\n",
      "| 14|LC08_001006_20180620|      1.37|2018-06-20 14:08:...|\"{\"\"type\"\":\"\"Line...|POLYGON ((-22.339...|\n",
      "| 15|LC08_001006_20180706|      13.4|2018-07-06 14:08:...|\"{\"\"type\"\":\"\"Line...|POLYGON ((-29.103...|\n",
      "| 16|LC08_001006_20180722|     43.27|2018-07-22 14:08:...|\"{\"\"type\"\":\"\"Line...|POLYGON ((-29.090...|\n",
      "| 17|LC08_001006_20180807|      0.23|2018-08-07 14:08:...|\"{\"\"type\"\":\"\"Line...|POLYGON ((-19.801...|\n",
      "| 18|LC08_001006_20180823|     52.62|2018-08-23 14:08:...|\"{\"\"type\"\":\"\"Line...|POLYGON ((-24.993...|\n",
      "| 19|LC08_001006_20180908|     40.37|2018-09-08 14:08:...|\"{\"\"type\"\":\"\"Line...|POLYGON ((-22.364...|\n",
      "+---+--------------------+----------+--------------------+--------------------+--------------------+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "l8_df_wkt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "id": "cutting-stick",
   "metadata": {},
   "outputs": [],
   "source": [
    "s2_df_wkt = s2_df_raw.withColumn('WKT_geo', generate_WKT_udf(F.col('footprint')))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "id": "democratic-purse",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+---+--------------------+----------+--------------------+--------------------+--------------------+\n",
      "|_c0|        system:index|cloudcover|           timestamp|           footprint|             WKT_geo|\n",
      "+---+--------------------+----------+--------------------+--------------------+--------------------+\n",
      "|  0|20171226T094359_2...|     9.219|2018-02-02 11:37:...|\"{\"\"type\"\":\"\"Line...|POLYGON ((-6.5921...|\n",
      "|  1|20180101T082329_2...|    98.622|2018-01-01 08:23:...|\"{\"\"type\"\":\"\"Line...|POLYGON ((33.1015...|\n",
      "|  2|20180101T082329_2...|     100.0|2018-01-01 08:23:...|\"{\"\"type\"\":\"\"Line...|POLYGON ((34.1301...|\n",
      "|  3|20180101T082329_2...|   47.2689|2018-01-01 08:23:...|\"{\"\"type\"\":\"\"Line...|POLYGON ((34.1465...|\n",
      "|  4|20180101T082329_2...|   51.0769|2018-01-01 08:23:...|\"{\"\"type\"\":\"\"Line...|POLYGON ((34.1574...|\n",
      "|  5|20180101T082329_2...|   76.9772|2018-01-01 08:23:...|\"{\"\"type\"\":\"\"Line...|POLYGON ((35.1594...|\n",
      "|  6|20180101T082329_2...|   77.5421|2018-01-01 08:23:...|\"{\"\"type\"\":\"\"Line...|POLYGON ((35.1689...|\n",
      "|  7|20180101T082329_2...|   85.8672|2018-01-01 08:23:...|\"{\"\"type\"\":\"\"Line...|POLYGON ((34.0543...|\n",
      "|  8|20180101T082329_2...|   37.5433|2018-01-01 08:23:...|\"{\"\"type\"\":\"\"Line...|POLYGON ((35.0638...|\n",
      "|  9|20180101T082329_2...|   67.3636|2018-01-01 08:23:...|\"{\"\"type\"\":\"\"Line...|POLYGON ((36.0089...|\n",
      "| 10|20180101T082329_2...|   85.0339|2018-01-01 08:23:...|\"{\"\"type\"\":\"\"Line...|POLYGON ((35.1079...|\n",
      "| 11|20180101T082329_2...|   17.2088|2018-01-01 08:23:...|\"{\"\"type\"\":\"\"Line...|POLYGON ((33.3628...|\n",
      "| 12|20180101T082329_2...|   16.1119|2018-01-01 08:23:...|\"{\"\"type\"\":\"\"Line...|POLYGON ((34.1677...|\n",
      "| 13|20180101T082329_2...|    3.9833|2018-01-01 08:23:...|\"{\"\"type\"\":\"\"Line...|POLYGON ((34.1008...|\n",
      "| 14|20180101T082329_2...|   73.4103|2018-01-01 08:23:...|\"{\"\"type\"\":\"\"Line...|POLYGON ((34.0647...|\n",
      "| 15|20180101T082329_2...|   68.6584|2018-01-01 08:23:...|\"{\"\"type\"\":\"\"Line...|POLYGON ((34.0756...|\n",
      "| 16|20180101T082329_2...|   51.4231|2018-01-01 08:23:...|\"{\"\"type\"\":\"\"Line...|POLYGON ((34.0746...|\n",
      "| 17|20180101T082329_2...|   28.1423|2018-01-01 08:23:...|\"{\"\"type\"\":\"\"Line...|POLYGON ((35.3049...|\n",
      "| 18|20180101T082329_2...|   23.1729|2018-01-01 08:23:...|\"{\"\"type\"\":\"\"Line...|POLYGON ((34.6001...|\n",
      "| 19|20180101T082329_2...|    6.5445|2018-01-01 08:23:...|\"{\"\"type\"\":\"\"Line...|POLYGON ((35.3583...|\n",
      "+---+--------------------+----------+--------------------+--------------------+--------------------+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "s2_df_wkt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "id": "lasting-locking",
   "metadata": {},
   "outputs": [],
   "source": [
    "# s2_df_raw_pd = s2_df_raw.limit(5).toPandas()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "id": "twelve-height",
   "metadata": {},
   "outputs": [],
   "source": [
    "# s2_df_raw_pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "id": "coordinated-footwear",
   "metadata": {},
   "outputs": [],
   "source": [
    "# pr_df_wkt = pr_df_raw.withColumn('WKT_geo', generate_WKT_udf(F.col('footprint')))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "unique-slope",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "id": "disabled-disclaimer",
   "metadata": {},
   "outputs": [],
   "source": [
    "s2_df_wkt = s2_df_wkt.withColumnRenamed('system:index', 's2_index')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "id": "distributed-warrant",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "root\n",
      " |-- _c0: integer (nullable = true)\n",
      " |-- s2_index: string (nullable = true)\n",
      " |-- cloudcover: double (nullable = true)\n",
      " |-- timestamp: timestamp (nullable = true)\n",
      " |-- footprint: string (nullable = true)\n",
      " |-- WKT_geo: string (nullable = true)\n",
      "\n"
     ]
    }
   ],
   "source": [
    "s2_df_wkt.printSchema()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "id": "modern-wednesday",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "root\n",
      " |-- _c0: integer (nullable = true)\n",
      " |-- system:index: string (nullable = true)\n",
      " |-- cloudcover: double (nullable = true)\n",
      " |-- timestamp: timestamp (nullable = true)\n",
      " |-- footprint: string (nullable = true)\n",
      " |-- WKT_geo: string (nullable = true)\n",
      "\n"
     ]
    }
   ],
   "source": [
    "l8_df_wkt.printSchema()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "id": "twenty-carry",
   "metadata": {},
   "outputs": [],
   "source": [
    "l8_df_wkt = l8_df_wkt.withColumnRenamed('system:index', 'l8_index')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "stunning-adventure",
   "metadata": {},
   "source": [
    "### transfer to Sedona df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "id": "anonymous-bailey",
   "metadata": {},
   "outputs": [],
   "source": [
    "l8_df_wkt.createOrReplaceTempView(\"l8_df_wkt\")\n",
    "l8_df_sedona = spark.sql(\"select ST_GeomFromWKT(WKT_geo) as l8_geometry, l8_index as l8_img_id, timestamp as l8_timestep from l8_df_wkt\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "id": "after-musical",
   "metadata": {},
   "outputs": [],
   "source": [
    "# calculate the area for the l8\n",
    "\n",
    "l8_df_sedona.createOrReplaceTempView(\"l8_df_sedona\")\n",
    "l8_df_sedona = spark.sql(\"select ST_Area(l8_geometry) as l8_area, * from l8_df_sedona\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "id": "intended-wisdom",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+------------------+--------------------+--------------------+--------------------+\n",
      "|           l8_area|         l8_geometry|           l8_img_id|         l8_timestep|\n",
      "+------------------+--------------------+--------------------+--------------------+\n",
      "|14.124304956531168|POLYGON ((-10.924...|LC08_001004_20180706|2018-07-06 14:07:...|\n",
      "|  14.1368958808218|POLYGON ((-16.038...|LC08_001004_20180722|2018-07-22 14:07:...|\n",
      "| 14.13397280878489|POLYGON ((-12.596...|LC08_001004_20180807|2018-08-07 14:07:...|\n",
      "| 14.21783504836524|POLYGON ((-13.429...|LC08_001004_20180908|2018-09-08 14:07:...|\n",
      "|13.013235111990365|POLYGON ((-19.204...|LC08_001005_20180417|2018-04-17 14:08:...|\n",
      "|13.016815047842258|POLYGON ((-15.823...|LC08_001005_20180706|2018-07-06 14:07:...|\n",
      "| 13.04422889786937|POLYGON ((-20.743...|LC08_001005_20180722|2018-07-22 14:07:...|\n",
      "|13.025145367910945|POLYGON ((-25.727...|LC08_001005_20180807|2018-08-07 14:08:...|\n",
      "|13.088991981998086|POLYGON ((-19.249...|LC08_001005_20180823|2018-08-23 14:08:...|\n",
      "|13.036544425947993|POLYGON ((-19.547...|LC08_001005_20180908|2018-09-08 14:08:...|\n",
      "|11.996375308726677|POLYGON ((-19.492...|LC08_001006_20180401|2018-04-01 14:08:...|\n",
      "| 12.04368416424424|POLYGON ((-21.456...|LC08_001006_20180417|2018-04-17 14:08:...|\n",
      "|11.994548598119355|POLYGON ((-29.341...|LC08_001006_20180519|2018-05-19 14:08:...|\n",
      "|11.907405621268417|POLYGON ((-29.095...|LC08_001006_20180604|2018-06-04 14:07:...|\n",
      "|12.011768123945448|POLYGON ((-22.339...|LC08_001006_20180620|2018-06-20 14:08:...|\n",
      "|11.997177863781467|POLYGON ((-29.103...|LC08_001006_20180706|2018-07-06 14:08:...|\n",
      "|12.036923004713366|POLYGON ((-29.090...|LC08_001006_20180722|2018-07-22 14:08:...|\n",
      "|11.964717576376385|POLYGON ((-19.801...|LC08_001006_20180807|2018-08-07 14:08:...|\n",
      "|11.985447360963212|POLYGON ((-24.993...|LC08_001006_20180823|2018-08-23 14:08:...|\n",
      "|11.987771678599081|POLYGON ((-22.364...|LC08_001006_20180908|2018-09-08 14:08:...|\n",
      "+------------------+--------------------+--------------------+--------------------+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "l8_df_sedona.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "id": "velvet-animation",
   "metadata": {},
   "outputs": [],
   "source": [
    "s2_df_wkt.createOrReplaceTempView(\"s2_df_wkt\")\n",
    "s2_df_sedona = spark.sql(\"select ST_GeomFromWKT(WKT_geo) as s2_geometry, s2_index as s2_id, timestamp as s2_timestamp from s2_df_wkt\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "id": "inner-diversity",
   "metadata": {},
   "outputs": [],
   "source": [
    "# calculate the area for the s2\n",
    "\n",
    "s2_df_sedona.createOrReplaceTempView(\"s2_df_sedona\")\n",
    "s2_df_sedona = spark.sql(\"select ST_Area(s2_geometry) as s2_area, * from s2_df_sedona\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "id": "prerequisite-copying",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+--------------------+--------------------+--------------------+--------------------+\n",
      "|             s2_area|         s2_geometry|               s2_id|        s2_timestamp|\n",
      "+--------------------+--------------------+--------------------+--------------------+\n",
      "|  0.9353367184743338|POLYGON ((-6.5921...|20171226T094359_2...|2018-02-02 11:37:...|\n",
      "|    0.04975974010451|POLYGON ((33.1015...|20180101T082329_2...|2018-01-01 08:23:...|\n",
      "|0.028138156238690933|POLYGON ((34.1301...|20180101T082329_2...|2018-01-01 08:23:...|\n",
      "|  0.9896346515425708|POLYGON ((34.1465...|20180101T082329_2...|2018-01-01 08:23:...|\n",
      "|  0.8891776385157389|POLYGON ((34.1574...|20180101T082329_2...|2018-01-01 08:23:...|\n",
      "| 0.22045620916115177|POLYGON ((35.1594...|20180101T082329_2...|2018-01-01 08:23:...|\n",
      "|  1.1296895785175753|POLYGON ((35.1689...|20180101T082329_2...|2018-01-01 08:23:...|\n",
      "|  1.1403158483431393|POLYGON ((34.0543...|20180101T082329_2...|2018-01-01 08:23:...|\n",
      "|  0.3490307087263577|POLYGON ((35.0638...|20180101T082329_2...|2018-01-01 08:23:...|\n",
      "|  1.0561604589873097|POLYGON ((36.0089...|20180101T082329_2...|2018-01-01 08:23:...|\n",
      "|  1.1389331101552205|POLYGON ((35.1079...|20180101T082329_2...|2018-01-01 08:23:...|\n",
      "|  0.6674468416368683|POLYGON ((33.3628...|20180101T082329_2...|2018-01-01 08:23:...|\n",
      "|  0.4433850315383707|POLYGON ((34.1677...|20180101T082329_2...|2018-01-01 08:23:...|\n",
      "|   0.216942383203032|POLYGON ((34.1008...|20180101T082329_2...|2018-01-01 08:23:...|\n",
      "|  1.1511821013349017|POLYGON ((34.0647...|20180101T082329_2...|2018-01-01 08:23:...|\n",
      "|  1.1625541701612807|POLYGON ((34.0756...|20180101T082329_2...|2018-01-01 08:23:...|\n",
      "|  1.1741105497465945|POLYGON ((34.0746...|20180101T082329_2...|2018-01-01 08:23:...|\n",
      "|  1.0694739134405127|POLYGON ((35.3049...|20180101T082329_2...|2018-01-01 08:23:...|\n",
      "|  0.8494615591637208|POLYGON ((34.6001...|20180101T082329_2...|2018-01-01 08:23:...|\n",
      "|  0.6255690971259398|POLYGON ((35.3583...|20180101T082329_2...|2018-01-01 08:23:...|\n",
      "+--------------------+--------------------+--------------------+--------------------+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "s2_df_sedona.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "essential-constant",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "id": "advance-transportation",
   "metadata": {},
   "outputs": [],
   "source": [
    "# pr_df_wkt.printSchema()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "id": "controlling-semiconductor",
   "metadata": {},
   "outputs": [],
   "source": [
    "# pr_df_wkt.createOrReplaceTempView(\"pr_df_wkt\")\n",
    "# pr_df_sedona = spark.sql(\"select ST_GeomFromWKT(WKT_geo) as pr_geometry, img_id as pr_id, timestep as pr_timestep from pr_df_wkt\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "id": "direct-control",
   "metadata": {},
   "outputs": [],
   "source": [
    "# pr_df_sedona.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "upset-concentrate",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "frequent-construction",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "id": "cleared-essay",
   "metadata": {},
   "source": [
    "### prepare timestamps for temporal join"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "id": "stock-bonus",
   "metadata": {},
   "outputs": [],
   "source": [
    "l8_df_sedona = l8_df_sedona.withColumn('l8_timestamp_date', F.to_timestamp(F.col('l8_timestep'), 'yyyy-MM-dd HH:mm:ss'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "id": "threaded-lloyd",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+------------------+--------------------+--------------------+--------------------+-------------------+\n",
      "|           l8_area|         l8_geometry|           l8_img_id|         l8_timestep|  l8_timestamp_date|\n",
      "+------------------+--------------------+--------------------+--------------------+-------------------+\n",
      "|14.124304956531168|POLYGON ((-10.924...|LC08_001004_20180706|2018-07-06 14:07:...|2018-07-06 14:07:29|\n",
      "|  14.1368958808218|POLYGON ((-16.038...|LC08_001004_20180722|2018-07-22 14:07:...|2018-07-22 14:07:36|\n",
      "| 14.13397280878489|POLYGON ((-12.596...|LC08_001004_20180807|2018-08-07 14:07:...|2018-08-07 14:07:44|\n",
      "| 14.21783504836524|POLYGON ((-13.429...|LC08_001004_20180908|2018-09-08 14:07:...|2018-09-08 14:07:58|\n",
      "|13.013235111990365|POLYGON ((-19.204...|LC08_001005_20180417|2018-04-17 14:08:...|2018-04-17 14:08:04|\n",
      "|13.016815047842258|POLYGON ((-15.823...|LC08_001005_20180706|2018-07-06 14:07:...|2018-07-06 14:07:52|\n",
      "| 13.04422889786937|POLYGON ((-20.743...|LC08_001005_20180722|2018-07-22 14:07:...|2018-07-22 14:07:59|\n",
      "|13.025145367910945|POLYGON ((-25.727...|LC08_001005_20180807|2018-08-07 14:08:...|2018-08-07 14:08:08|\n",
      "|13.088991981998086|POLYGON ((-19.249...|LC08_001005_20180823|2018-08-23 14:08:...|2018-08-23 14:08:16|\n",
      "|13.036544425947993|POLYGON ((-19.547...|LC08_001005_20180908|2018-09-08 14:08:...|2018-09-08 14:08:22|\n",
      "|11.996375308726677|POLYGON ((-19.492...|LC08_001006_20180401|2018-04-01 14:08:...|2018-04-01 14:08:37|\n",
      "| 12.04368416424424|POLYGON ((-21.456...|LC08_001006_20180417|2018-04-17 14:08:...|2018-04-17 14:08:28|\n",
      "|11.994548598119355|POLYGON ((-29.341...|LC08_001006_20180519|2018-05-19 14:08:...|2018-05-19 14:08:08|\n",
      "|11.907405621268417|POLYGON ((-29.095...|LC08_001006_20180604|2018-06-04 14:07:...|2018-06-04 14:07:57|\n",
      "|12.011768123945448|POLYGON ((-22.339...|LC08_001006_20180620|2018-06-20 14:08:...|2018-06-20 14:08:07|\n",
      "|11.997177863781467|POLYGON ((-29.103...|LC08_001006_20180706|2018-07-06 14:08:...|2018-07-06 14:08:16|\n",
      "|12.036923004713366|POLYGON ((-29.090...|LC08_001006_20180722|2018-07-22 14:08:...|2018-07-22 14:08:23|\n",
      "|11.964717576376385|POLYGON ((-19.801...|LC08_001006_20180807|2018-08-07 14:08:...|2018-08-07 14:08:32|\n",
      "|11.985447360963212|POLYGON ((-24.993...|LC08_001006_20180823|2018-08-23 14:08:...|2018-08-23 14:08:40|\n",
      "|11.987771678599081|POLYGON ((-22.364...|LC08_001006_20180908|2018-09-08 14:08:...|2018-09-08 14:08:46|\n",
      "+------------------+--------------------+--------------------+--------------------+-------------------+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "l8_df_sedona.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "id": "awful-agriculture",
   "metadata": {},
   "outputs": [],
   "source": [
    "s2_df_sedona = s2_df_sedona.withColumn('s2_timestamp_date', F.to_timestamp(F.col('s2_timestamp'), 'yyyy-MM-dd HH:mm:ss'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "id": "sound-egyptian",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+--------------------+--------------------+--------------------+--------------------+-------------------+\n",
      "|             s2_area|         s2_geometry|               s2_id|        s2_timestamp|  s2_timestamp_date|\n",
      "+--------------------+--------------------+--------------------+--------------------+-------------------+\n",
      "|  0.9353367184743338|POLYGON ((-6.5921...|20171226T094359_2...|2018-02-02 11:37:...|2018-02-02 11:37:55|\n",
      "|    0.04975974010451|POLYGON ((33.1015...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|\n",
      "|0.028138156238690933|POLYGON ((34.1301...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|\n",
      "|  0.9896346515425708|POLYGON ((34.1465...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|\n",
      "|  0.8891776385157389|POLYGON ((34.1574...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|\n",
      "| 0.22045620916115177|POLYGON ((35.1594...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|\n",
      "|  1.1296895785175753|POLYGON ((35.1689...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|\n",
      "|  1.1403158483431393|POLYGON ((34.0543...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|\n",
      "|  0.3490307087263577|POLYGON ((35.0638...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|\n",
      "|  1.0561604589873097|POLYGON ((36.0089...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|\n",
      "|  1.1389331101552205|POLYGON ((35.1079...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|\n",
      "|  0.6674468416368683|POLYGON ((33.3628...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|\n",
      "|  0.4433850315383707|POLYGON ((34.1677...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|\n",
      "|   0.216942383203032|POLYGON ((34.1008...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|\n",
      "|  1.1511821013349017|POLYGON ((34.0647...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|\n",
      "|  1.1625541701612807|POLYGON ((34.0756...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|\n",
      "|  1.1741105497465945|POLYGON ((34.0746...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|\n",
      "|  1.0694739134405127|POLYGON ((35.3049...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|\n",
      "|  0.8494615591637208|POLYGON ((34.6001...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|\n",
      "|  0.6255690971259398|POLYGON ((35.3583...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|\n",
      "+--------------------+--------------------+--------------------+--------------------+-------------------+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "s2_df_sedona.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "brief-difference",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "id": "devoted-jimmy",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Extract data within potential time slots\n",
    "import time\n",
    "import datetime\n",
    "\n",
    "start_timestamp = time.mktime(datetime.datetime.strptime(start_date, \"%Y-%m-%d\").timetuple())\n",
    "start_timestamp = start_timestamp - hour_delay * 3600 # for potential time delay\n",
    "\n",
    "end_timestamp = time.mktime(datetime.datetime.strptime(end_date, \"%Y-%m-%d\").timetuple())\n",
    "end_timestamp = end_timestamp + hour_delay * 3600 # for potential time delay"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "id": "decent-clearing",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "77209"
      ]
     },
     "execution_count": 54,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "l8_df_sedona = l8_df_sedona.filter(F.unix_timestamp(F.col('l8_timestep')) <= end_timestamp).filter(F.unix_timestamp(F.col('l8_timestep')) >= start_timestamp)\n",
    "l8_df_sedona.createOrReplaceTempView(\"l8_df_sedona\")\n",
    "\n",
    "## Show the schema of the table\n",
    "spark.table(\"l8_df_sedona\").count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "id": "graduate-edinburgh",
   "metadata": {},
   "outputs": [],
   "source": [
    "s2_df_sedona = s2_df_sedona.filter(F.unix_timestamp(F.col('s2_timestamp')) <= end_timestamp).filter(F.unix_timestamp(F.col('s2_timestamp')) >= start_timestamp)\n",
    "s2_df_sedona.createOrReplaceTempView(\"s2_df_sedona\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "id": "backed-going",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1514782800.0"
      ]
     },
     "execution_count": 56,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# try to generate a temporal index\n",
    "\n",
    "# find start timestamp\n",
    "import datetime\n",
    "import time\n",
    " \n",
    "# assigned regular string date\n",
    "start_date_datetype = datetime.datetime(2018, 1, 1, 0, 0)\n",
    "# print(\"date_time =>\",date_time)\n",
    " \n",
    "start_date_timestamp = time.mktime(start_date_datetype.timetuple())\n",
    "start_date_timestamp"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "id": "contrary-priority",
   "metadata": {},
   "outputs": [],
   "source": [
    "# get time_duration and hour_delay into seconds for comparison\n",
    "time_index_hour_length_second = time_index_hour_length * 3600\n",
    "hour_delay_second = hour_delay * 3600"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "id": "subsequent-encoding",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "86400"
      ]
     },
     "execution_count": 58,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "time_index_hour_length_second"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "id": "insured-muslim",
   "metadata": {},
   "outputs": [],
   "source": [
    "sentinel_sedona_time_index = s2_df_sedona.withColumn('time_index', F.floor((F.unix_timestamp('s2_timestamp_date') - F.lit(start_date_timestamp)) / F.lit(time_index_hour_length_second)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "id": "banner-aggregate",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+--------------------+--------------------+--------------------+--------------------+-------------------+----------+\n",
      "|             s2_area|         s2_geometry|               s2_id|        s2_timestamp|  s2_timestamp_date|time_index|\n",
      "+--------------------+--------------------+--------------------+--------------------+-------------------+----------+\n",
      "|  0.9353367184743338|POLYGON ((-6.5921...|20171226T094359_2...|2018-02-02 11:37:...|2018-02-02 11:37:55|        32|\n",
      "|    0.04975974010451|POLYGON ((33.1015...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|\n",
      "|0.028138156238690933|POLYGON ((34.1301...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|\n",
      "|  0.9896346515425708|POLYGON ((34.1465...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|\n",
      "|  0.8891776385157389|POLYGON ((34.1574...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|\n",
      "| 0.22045620916115177|POLYGON ((35.1594...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|\n",
      "|  1.1296895785175753|POLYGON ((35.1689...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|\n",
      "|  1.1403158483431393|POLYGON ((34.0543...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|\n",
      "|  0.3490307087263577|POLYGON ((35.0638...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|\n",
      "|  1.0561604589873097|POLYGON ((36.0089...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|\n",
      "|  1.1389331101552205|POLYGON ((35.1079...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|\n",
      "|  0.6674468416368683|POLYGON ((33.3628...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|\n",
      "|  0.4433850315383707|POLYGON ((34.1677...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|\n",
      "|   0.216942383203032|POLYGON ((34.1008...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|\n",
      "|  1.1511821013349017|POLYGON ((34.0647...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|\n",
      "|  1.1625541701612807|POLYGON ((34.0756...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|\n",
      "|  1.1741105497465945|POLYGON ((34.0746...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|\n",
      "|  1.0694739134405127|POLYGON ((35.3049...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|\n",
      "|  0.8494615591637208|POLYGON ((34.6001...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|\n",
      "|  0.6255690971259398|POLYGON ((35.3583...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|\n",
      "+--------------------+--------------------+--------------------+--------------------+-------------------+----------+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "sentinel_sedona_time_index.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "id": "bigger-browser",
   "metadata": {},
   "outputs": [],
   "source": [
    "sentinel_sedona_time_index_early = sentinel_sedona_time_index.filter(((F.unix_timestamp('s2_timestamp_date') - F.lit(start_date_timestamp)) % F.lit(time_index_hour_length_second)) <= hour_delay_second)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "id": "adjacent-interest",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-------+-----------+-----+------------+-----------------+----------+\n",
      "|s2_area|s2_geometry|s2_id|s2_timestamp|s2_timestamp_date|time_index|\n",
      "+-------+-----------+-----+------------+-----------------+----------+\n",
      "+-------+-----------+-----+------------+-----------------+----------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "sentinel_sedona_time_index_early.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "offensive-delaware",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "id": "intense-character",
   "metadata": {},
   "outputs": [],
   "source": [
    "sentinel_sedona_time_index_late = sentinel_sedona_time_index.filter(((F.unix_timestamp('s2_timestamp_date') - F.lit(start_date_timestamp)) % F.lit(time_index_hour_length_second)) >= (time_index_hour_length_second - hour_delay_second))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "id": "capable-airline",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-------+-----------+-----+------------+-----------------+----------+\n",
      "|s2_area|s2_geometry|s2_id|s2_timestamp|s2_timestamp_date|time_index|\n",
      "+-------+-----------+-----+------------+-----------------+----------+\n",
      "+-------+-----------+-----+------------+-----------------+----------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "sentinel_sedona_time_index_late.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ambient-resolution",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "id": "stupid-democrat",
   "metadata": {},
   "outputs": [],
   "source": [
    "sentinel_sedona_dateforjoin = sentinel_sedona_time_index.withColumn('time_index_join', F.col('time_index')).union(sentinel_sedona_time_index_late.withColumn('time_index_join', (F.col('time_index') + 1))).union(sentinel_sedona_time_index_early.withColumn('time_index_join', (F.col('time_index') - 1)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 66,
   "id": "basic-assumption",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+--------------------+--------------------+--------------------+--------------------+-------------------+----------+---------------+\n",
      "|             s2_area|         s2_geometry|               s2_id|        s2_timestamp|  s2_timestamp_date|time_index|time_index_join|\n",
      "+--------------------+--------------------+--------------------+--------------------+-------------------+----------+---------------+\n",
      "|  0.9353367184743338|POLYGON ((-6.5921...|20171226T094359_2...|2018-02-02 11:37:...|2018-02-02 11:37:55|        32|             32|\n",
      "|    0.04975974010451|POLYGON ((33.1015...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|              0|\n",
      "|0.028138156238690933|POLYGON ((34.1301...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|              0|\n",
      "|  0.9896346515425708|POLYGON ((34.1465...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|              0|\n",
      "|  0.8891776385157389|POLYGON ((34.1574...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|              0|\n",
      "| 0.22045620916115177|POLYGON ((35.1594...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|              0|\n",
      "|  1.1296895785175753|POLYGON ((35.1689...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|              0|\n",
      "|  1.1403158483431393|POLYGON ((34.0543...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|              0|\n",
      "|  0.3490307087263577|POLYGON ((35.0638...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|              0|\n",
      "|  1.0561604589873097|POLYGON ((36.0089...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|              0|\n",
      "|  1.1389331101552205|POLYGON ((35.1079...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|              0|\n",
      "|  0.6674468416368683|POLYGON ((33.3628...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|              0|\n",
      "|  0.4433850315383707|POLYGON ((34.1677...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|              0|\n",
      "|   0.216942383203032|POLYGON ((34.1008...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|              0|\n",
      "|  1.1511821013349017|POLYGON ((34.0647...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|              0|\n",
      "|  1.1625541701612807|POLYGON ((34.0756...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|              0|\n",
      "|  1.1741105497465945|POLYGON ((34.0746...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|              0|\n",
      "|  1.0694739134405127|POLYGON ((35.3049...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|              0|\n",
      "|  0.8494615591637208|POLYGON ((34.6001...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|              0|\n",
      "|  0.6255690971259398|POLYGON ((35.3583...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|              0|\n",
      "+--------------------+--------------------+--------------------+--------------------+-------------------+----------+---------------+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "sentinel_sedona_dateforjoin.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 67,
   "id": "dried-complement",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "162467"
      ]
     },
     "execution_count": 67,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sentinel_sedona_dateforjoin.count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "id": "certain-father",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "162467"
      ]
     },
     "execution_count": 68,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sentinel_sedona_dateforjoin.drop_duplicates(subset=['s2_id']).count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "revised-hunter",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "russian-bikini",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "religious-republican",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "resident-seven",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "restricted-russell",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "id": "conventional-opposition",
   "metadata": {},
   "source": [
    "## Third, spatio-temporal join Sentinel dataset and L8 dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 69,
   "id": "ancient-uzbekistan",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 3.1 join the two Sedona dataframe according the timestamps, hour level at now, consider time delay"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "id": "prescribed-calendar",
   "metadata": {},
   "outputs": [],
   "source": [
    "sentinel_sedona_dateforjoin_maxmin = sentinel_sedona_dateforjoin.withColumn('sentinel_max_timestamp', (F.unix_timestamp(\"s2_timestamp_date\") + hour_delay * 3600)).\\\n",
    "withColumn('sentinel_min_timestamp', (F.unix_timestamp(\"s2_timestamp_date\") - hour_delay * 3600))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 71,
   "id": "traditional-idaho",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+--------------------+--------------------+--------------------+--------------------+-------------------+----------+---------------+----------------------+----------------------+\n",
      "|             s2_area|         s2_geometry|               s2_id|        s2_timestamp|  s2_timestamp_date|time_index|time_index_join|sentinel_max_timestamp|sentinel_min_timestamp|\n",
      "+--------------------+--------------------+--------------------+--------------------+-------------------+----------+---------------+----------------------+----------------------+\n",
      "|  0.9353367184743338|POLYGON ((-6.5921...|20171226T094359_2...|2018-02-02 11:37:...|2018-02-02 11:37:55|        32|             32|            1517611075|            1517567875|\n",
      "|    0.04975974010451|POLYGON ((33.1015...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|              0|            1514834609|            1514791409|\n",
      "|0.028138156238690933|POLYGON ((34.1301...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|              0|            1514834609|            1514791409|\n",
      "|  0.9896346515425708|POLYGON ((34.1465...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|              0|            1514834609|            1514791409|\n",
      "|  0.8891776385157389|POLYGON ((34.1574...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|              0|            1514834609|            1514791409|\n",
      "| 0.22045620916115177|POLYGON ((35.1594...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|              0|            1514834609|            1514791409|\n",
      "|  1.1296895785175753|POLYGON ((35.1689...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|              0|            1514834609|            1514791409|\n",
      "|  1.1403158483431393|POLYGON ((34.0543...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|              0|            1514834609|            1514791409|\n",
      "|  0.3490307087263577|POLYGON ((35.0638...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|              0|            1514834609|            1514791409|\n",
      "|  1.0561604589873097|POLYGON ((36.0089...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|              0|            1514834609|            1514791409|\n",
      "|  1.1389331101552205|POLYGON ((35.1079...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|              0|            1514834609|            1514791409|\n",
      "|  0.6674468416368683|POLYGON ((33.3628...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|              0|            1514834609|            1514791409|\n",
      "|  0.4433850315383707|POLYGON ((34.1677...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|              0|            1514834609|            1514791409|\n",
      "|   0.216942383203032|POLYGON ((34.1008...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|              0|            1514834609|            1514791409|\n",
      "|  1.1511821013349017|POLYGON ((34.0647...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|              0|            1514834609|            1514791409|\n",
      "|  1.1625541701612807|POLYGON ((34.0756...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|              0|            1514834609|            1514791409|\n",
      "|  1.1741105497465945|POLYGON ((34.0746...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|              0|            1514834609|            1514791409|\n",
      "|  1.0694739134405127|POLYGON ((35.3049...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|              0|            1514834609|            1514791409|\n",
      "|  0.8494615591637208|POLYGON ((34.6001...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|              0|            1514834609|            1514791409|\n",
      "|  0.6255690971259398|POLYGON ((35.3583...|20180101T082329_2...|2018-01-01 08:23:...|2018-01-01 08:23:29|         0|              0|            1514834609|            1514791409|\n",
      "+--------------------+--------------------+--------------------+--------------------+-------------------+----------+---------------+----------------------+----------------------+\n",
      "only showing top 20 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "sentinel_sedona_dateforjoin_maxmin.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b996073d-d57e-44a5-89a9-b5b479323b4a",
   "metadata": {},
   "source": [
    "## temporal slicing index is not using in the ISAT strategy"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 72,
   "id": "furnished-translator",
   "metadata": {},
   "outputs": [],
   "source": [
    "l8_df_sedona = l8_df_sedona.withColumn('time_index_join', F.floor((F.unix_timestamp('l8_timestamp_date') - F.lit(start_date_timestamp)) / F.lit(time_index_hour_length_second)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 73,
   "id": "proved-landscape",
   "metadata": {},
   "outputs": [],
   "source": [
    "l8_df_sedona = l8_df_sedona.withColumn('l8_timestamp', F.unix_timestamp(F.col('l8_timestamp_date')))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 74,
   "id": "found-climate",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "root\n",
      " |-- l8_area: double (nullable = false)\n",
      " |-- l8_geometry: geometry (nullable = false)\n",
      " |-- l8_img_id: string (nullable = true)\n",
      " |-- l8_timestep: timestamp (nullable = true)\n",
      " |-- l8_timestamp_date: timestamp (nullable = true)\n",
      " |-- time_index_join: long (nullable = true)\n",
      " |-- l8_timestamp: long (nullable = true)\n",
      "\n"
     ]
    }
   ],
   "source": [
    "l8_df_sedona.printSchema()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 75,
   "id": "twenty-process",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "root\n",
      " |-- s2_area: double (nullable = false)\n",
      " |-- s2_geometry: geometry (nullable = false)\n",
      " |-- s2_id: string (nullable = true)\n",
      " |-- s2_timestamp: timestamp (nullable = true)\n",
      " |-- s2_timestamp_date: timestamp (nullable = true)\n",
      " |-- time_index: long (nullable = true)\n",
      " |-- time_index_join: long (nullable = true)\n",
      " |-- sentinel_max_timestamp: long (nullable = true)\n",
      " |-- sentinel_min_timestamp: long (nullable = true)\n",
      "\n"
     ]
    }
   ],
   "source": [
    "sentinel_sedona_dateforjoin_maxmin.printSchema()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 76,
   "id": "sweet-moscow",
   "metadata": {},
   "outputs": [],
   "source": [
    "sentinel_sedona_dateforjoin_maxmin.createOrReplaceTempView(\"sentinel_sedona_dateforjoin_maxmin\")\n",
    "l8_df_sedona.createOrReplaceTempView(\"l8_df_sedona\")\n",
    "test_temporal_indexed_spatial_join = spark.sql(\"SELECT * FROM sentinel_sedona_dateforjoin_maxmin, l8_df_sedona WHERE ST_Intersects(sentinel_sedona_dateforjoin_maxmin.s2_geometry, l8_df_sedona.l8_geometry)\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 77,
   "id": "velvet-hearing",
   "metadata": {},
   "outputs": [],
   "source": [
    "# test_temporal_indexed_spatial_join.explain()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 78,
   "id": "rocky-world",
   "metadata": {},
   "outputs": [],
   "source": [
    "# import time\n",
    "# start_spatial_time = time.time()\n",
    "# test_spatial_join.select('uuid', 'Description', 'is_timestamp').write.format('com.databricks.spark.csv')\\\n",
    "#         .mode(\"overwrite\")\\\n",
    "#         .option('header', True)\\\n",
    "#         .save('sentinel_is2_intersect_pure_spatial')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 79,
   "id": "transsexual-distributor",
   "metadata": {},
   "outputs": [],
   "source": [
    "# with open('speed_pure_spatial_join_month.txt', 'w') as f:\n",
    "#     f.write('pure spatial join time:')\n",
    "#     f.write(str(time.time() - start_spatial_time))\n",
    "#     f.write('/n')\n",
    "#     f.write('number of spatial intersections:')\n",
    "#     f.write(str(test_spatial_join.count()))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 80,
   "id": "hairy-market",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "root\n",
      " |-- s2_area: double (nullable = false)\n",
      " |-- s2_geometry: geometry (nullable = false)\n",
      " |-- s2_id: string (nullable = true)\n",
      " |-- s2_timestamp: timestamp (nullable = true)\n",
      " |-- s2_timestamp_date: timestamp (nullable = true)\n",
      " |-- time_index: long (nullable = true)\n",
      " |-- time_index_join: long (nullable = true)\n",
      " |-- sentinel_max_timestamp: long (nullable = true)\n",
      " |-- sentinel_min_timestamp: long (nullable = true)\n",
      " |-- l8_area: double (nullable = false)\n",
      " |-- l8_geometry: geometry (nullable = false)\n",
      " |-- l8_img_id: string (nullable = true)\n",
      " |-- l8_timestep: timestamp (nullable = true)\n",
      " |-- l8_timestamp_date: timestamp (nullable = true)\n",
      " |-- time_index_join: long (nullable = true)\n",
      " |-- l8_timestamp: long (nullable = true)\n",
      "\n"
     ]
    }
   ],
   "source": [
    "test_temporal_indexed_spatial_join.printSchema()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 81,
   "id": "textile-vaccine",
   "metadata": {},
   "outputs": [],
   "source": [
    "test_ST_join = test_temporal_indexed_spatial_join.filter(F.col('sentinel_max_timestamp') >= F.col('l8_timestamp')).filter(F.col('sentinel_min_timestamp') <= F.col('l8_timestamp'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 82,
   "id": "perfect-pendant",
   "metadata": {},
   "outputs": [],
   "source": [
    "# test_ST_join.explain()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 83,
   "id": "registered-settle",
   "metadata": {},
   "outputs": [],
   "source": [
    "import time\n",
    "start_temporal_time = time.time()\n",
    "test_ST_join.select('s2_id', 'l8_img_id', 's2_timestamp', 'l8_timestamp').write.format('com.databricks.spark.csv')\\\n",
    "        .mode(\"overwrite\")\\\n",
    "        .option('header', True)\\\n",
    "        .save('sentinel_L8_intersect_spatial_temporal')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 84,
   "id": "common-donor",
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('ISAT_{}_months_{}_hour_delay_S2L8_full.txt'.format(str(duration_month), str(hour_delay)), 'w') as f:\n",
    "    f.write('temporal index first join time in total:')\n",
    "    f.write(str(time.time() - start_temporal_time))\n",
    "    f.write('\\n')\n",
    "    f.write('number of spatial intersections:')\n",
    "    f.write(str(test_ST_join.count()))\n",
    "    f.write('\\n')\n",
    "    f.write('from and to date')\n",
    "    f.write(start_date)\n",
    "    f.write('\\n')\n",
    "    f.write(end_date)\n",
    "    f.write('\\n')\n",
    "    f.write('L8 data count:')\n",
    "    f.write(str(l8_df_sedona.count()))\n",
    "    f.write('\\n')\n",
    "\n",
    "    f.write('Sentinel 2 data count:')\n",
    "    f.write(str(s2_df_sedona.count()))\n",
    "    f.write('\\n')\n",
    "    f.write('hour_delay:')\n",
    "    f.write(str(hour_delay))\n",
    "    \n",
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 85,
   "id": "sudden-leadership",
   "metadata": {},
   "outputs": [],
   "source": [
    "# test_ST_join.select('s2_index', 'Description', 'is_timestamp').drop_duplicates(subset=['s2_index', 'Description']).count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "auburn-picking",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 86,
   "id": "written-decline",
   "metadata": {},
   "outputs": [],
   "source": [
    "# \"fhldsai{}_{}\".format(str(2), 'se')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 87,
   "id": "stopped-tracy",
   "metadata": {},
   "outputs": [],
   "source": [
    "# boundaries in the format of polygon）"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 88,
   "id": "catholic-throat",
   "metadata": {},
   "outputs": [],
   "source": [
    "# test_ST_join_pd = test_ST_join.toPandas()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 89,
   "id": "junior-appreciation",
   "metadata": {},
   "outputs": [],
   "source": [
    "# test_ST_join_pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 90,
   "id": "practical-ivory",
   "metadata": {},
   "outputs": [],
   "source": [
    "# gpd.GeoSeries(test_ST_join_area_pd['geometry']).buffer(0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 91,
   "id": "fatty-technical",
   "metadata": {},
   "outputs": [],
   "source": [
    "# test_ST_join_pd['intersection'] = gpd.GeoSeries(test_ST_join_pd['s2_geometry']).buffer(0).intersection(gpd.GeoSeries(test_ST_join_pd['l8_geometry']).buffer(0), align=False)\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 92,
   "id": "eligible-exchange",
   "metadata": {},
   "outputs": [],
   "source": [
    "# test_ST_join_pd['intersection_area'] = gpd.GeoSeries(test_ST_join_pd['intersection']).area\n",
    "# test_ST_join_pd['s2_pd_area'] = gpd.GeoSeries(test_ST_join_pd['s2_geometry']).area\n",
    "# test_ST_join_pd['l8_pd_area'] = gpd.GeoSeries(test_ST_join_pd['l8_geometry']).area"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 93,
   "id": "choice-shannon",
   "metadata": {},
   "outputs": [],
   "source": [
    "# test_ST_join_pd['intersection_percentage'] = test_ST_join_pd['intersection_area'] / test_ST_join_pd[['s2_pd_area', 's2_pd_area']].min(axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 94,
   "id": "induced-chapter",
   "metadata": {},
   "outputs": [],
   "source": [
    "# test_ST_join_pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 95,
   "id": "regular-information",
   "metadata": {},
   "outputs": [],
   "source": [
    "# test_ST_join_pd.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 96,
   "id": "handmade-ratio",
   "metadata": {},
   "outputs": [],
   "source": [
    "# test_ST_join_pd[test_ST_join_pd['intersection_percentage'] > 0.5].shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 97,
   "id": "yellow-daily",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 78130 / 364922"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 98,
   "id": "crucial-nelson",
   "metadata": {},
   "outputs": [],
   "source": [
    "# s2_l8_ST_join_valid_true_pd[s2_l8_ST_join_valid_true_pd['intersection_area'] / s2_l8_ST_join_valid_true_pd['s2_pd_area'] > 0.9]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 99,
   "id": "elementary-retreat",
   "metadata": {},
   "outputs": [],
   "source": [
    "# test_area_intersect = s2_l8_ST_join_valid_true_pd[s2_l8_ST_join_valid_true_pd['intersection_area'] / s2_l8_ST_join_valid_true_pd['s2_pd_area'] > 0.9]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 100,
   "id": "saving-elevation",
   "metadata": {},
   "outputs": [],
   "source": [
    "# with open('ICESpark_temporal_indexed_join_{}_months_{}_hour_delay_S2L8.txt'.format(str(duration_month), str(hour_delay)), 'a+') as f:\n",
    "#     f.write('percentage of intersection area larger than 50% of the smaller RS image')\n",
    "#     f.write('\\n')\n",
    "#     f.write(str(test_ST_join_pd[test_ST_join_pd['intersection_percentage'] > 0.5].shape[0] / test_ST_join_pd.shape[0]))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "supposed-prague",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
